Descrition of data cleaning:
reading in data
selecting and renaming the columns
since names are in last, first format, reorder them to be first last
make sure all “Jr”s go at the end
remove accents using stringi::stri_trans_general
salaries = read_excel(
"data/MLB-Salaries 2000-24.xlsx",
sheet = "2022.xls",
skip = 1) |>
select(1:4) |>
rename(position = "Pos'n",
salary_2022 = "2022.0",
service_time_yrs = "MLS",
name = "Player") %>%
mutate(name = str_split(name, ","),
name = map(name, rev),
name = map_chr(name, str_c, collapse = " "),
name = str_trim(name),
name = str_replace_all(name, "[*#\\.,]", ""),
junior = str_detect(name, "Jr"),
name = if_else(junior == TRUE, str_remove(name, " Jr"), name),
name = if_else(junior == TRUE, str_c(name, " Jr"), name),
name = stringi::stri_trans_general(name,id = "Latin-ASCII")) |>
select(-junior) %>%
mutate(simple_position = str_split_i(position, "-", 1),
simple_position = fct_relevel(simple_position,
c("c", "1b", "2b", "3b", "ss",
"lf", "cf", "rf", "inf", "of",
"dh", "rhp", "lhp")),
service_time_floor = floor(service_time_yrs))
## New names:
## • `` -> `...5`
Descrition of data cleaning:
reading in data and clean names
grouping by name and using a count to ensure we only end up with one of each name (slightly complicated due to interleague trades and potential for players with the same name)
remove whitespace, from names and use stringi again to remove accents so that the formatting exactly matches the salaries tibble
Filter out those with fewer than 100 plate appearances
batting = read_delim("data/2022 MLB Player Stats - Batting.csv", delim = ";",
locale = locale(encoding = "latin1")) |>
janitor::clean_names() %>%
group_by(name) |>
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & tm == "TOT" ~ TRUE,
.default = FALSE)) |>
filter(keep_row == TRUE) %>%
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & lg == "MLB" ~ TRUE,
.default = FALSE)) |>
ungroup() |>
filter(keep_row == TRUE) %>%
mutate(name = str_split(name, "\\s+"),
name = map_chr(name, str_c, collapse = " "),
name = str_trim(name),
name = str_replace_all(name, "[*#\\.]", ""),
name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
## Rows: 992 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (3): Name, Tm, Lg
## dbl (26): Rk, Age, G, PA, AB, R, H, 2B, 3B, HR, RBI, SB, CS, BB, SO, BA, OBP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_batting <- salaries |>
mutate(simple_position = str_split_i(position, "-", 1)) |>
filter(!simple_position %in% c("rhp","lhp")) |>
inner_join(batting, by = "name") |>
filter(pa >= 100)
salary_not_in_batting <- anti_join(salaries, batting, by = "name")
batting_not_in_salary <- anti_join(batting, salaries, by = "name")
(same data cleaning process as above)
pitching = read_delim("data/2022 MLB Player Stats - Pitching.csv", delim = ";",
locale = locale(encoding = "latin1")) |>
janitor::clean_names() %>%
group_by(name) |>
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & tm == "TOT" ~ TRUE,
.default = FALSE)) |>
filter(keep_row == TRUE) %>%
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & lg == "MLB" ~ TRUE,
.default = FALSE)) |>
ungroup() |>
filter(keep_row == TRUE) %>%
mutate(name = str_split(name, "\\s+"),
name = map_chr(name, str_c, collapse = " "),
name = str_trim(name),
name = str_replace_all(name, "[*#\\.]", ""),
name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
## Rows: 1081 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (3): Name, Tm, Lg
## dbl (32): Rk, Age, W, L, W-L%, ERA, G, GS, GF, CG, SHO, SV, IP, H, R, ER, HR...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_pitching <- inner_join(salaries, pitching, by = "name") |>
filter(ip >= 20) |>
separate(ip, into = c("ip", "ip_dec"), remove = FALSE, convert = TRUE) |>
mutate(ip_dec_333 = ifelse(is.na(ip_dec), 0, ip_dec * 333) ,
ip_total = paste(ip, ip_dec_333, sep = "."),
ip_total = as.numeric(ip_total)) |>
select(-ip, -ip_dec, -ip_dec_333) |>
separate(position, into = c("hand", "pitcher_type")) |>
mutate(pitcher_type = ifelse(is.na(pitcher_type), "r", pitcher_type)) |>
filter(hand == "lhp" | hand == "rhp")
## Warning in inner_join(salaries, pitching, by = "name"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 325 of `x` matches multiple rows in `y`.
## ℹ Row 265 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 172 rows [5, 6, 10, 12,
## 14, 22, 24, 27, 30, 32, 33, 34, 36, 38, 40, 41, 42, 43, 45, 47, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 336 rows [24, 31, 34, 40,
## 44, 46, 49, 53, 55, 58, 59, 63, 65, 68, 70, 72, 77, 79, 81, 82, ...].
salary_not_in_pitching <- anti_join(salaries, pitching, by = "name")
pitching_not_in_salary <- anti_join(pitching, salaries, by = "name")
Note that there are a number of names in salaries not
found in batting or pitching, and vice versa.
Some of this may be genuine missingness (e.g. salaries has fewer rows
than pitching and batting combined), but some is also due to alternative
spellings of names in the datasets. Possibly could be fixed using other
matching methods, but the analysis pipeline will still work from
here.
Some Basic EDA for salaries on their own
salaries %>%
ggplot(aes(x=simple_position,y=salary_2022)) +
geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
salaries %>%
ggplot(aes(x=floor(service_time_yrs),y=salary_2022)) +
geom_col()
## Warning: Removed 295 rows containing missing values or values outside the scale range
## (`geom_col()`).
salaries %>%
ggplot(aes(x=floor(service_time_yrs),fill = simple_position)) +
geom_bar(position = "fill")
salaries %>%
ggplot(aes(x=factor(floor(service_time_yrs)), y=salary_2022)) +
geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Start with some descriptive statistics/plots
Service time (i.e. number of years of experience)
merged_batting |>
ggplot(aes(x = service_time_yrs)) +
geom_density() +
labs(x = "Service time")
This is left skewed, so may need to transform it if used in
regression
Salary
merged_batting |>
ggplot(aes(x = salary_2022)) +
geom_density() +
labs(x = "Salary")
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_density()`).
This is also heavily left skewed, with many players having salary under
$1,000,000
Salary and service time are likely related, since rookies have contract minimums
merged_batting |>
ggplot(aes(x = service_time_yrs, y = salary_2022)) +
geom_point(alpha = 0.5)
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).
Position type
merged_batting |>
mutate(positions = str_split_i(position, "-",1)) |>
count(positions) |>
arrange(-n) |>
knitr::kable()
| positions | n |
|---|---|
| 2b | 64 |
| c | 59 |
| ss | 54 |
| 3b | 46 |
| 1b | 40 |
| of | 38 |
| cf | 33 |
| rf | 30 |
| lf | 25 |
| dh | 7 |
| inf | 2 |
Since we filtered on plate appearances, most players in the batting dataset are position players (i.e., not pitchers). If we use this in regression analysis, we may want to filter out pitchers entirely.
Looking at some hitting statistics
merged_batting |>
ggplot(aes(x = hr)) +
geom_histogram() +
labs(x = "Number of home runs (HRs)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
merged_batting |>
ggplot(aes(x = rbi)) +
geom_histogram() +
labs(x = "Runs batted in (RBI)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
These are also left skewed so will need to normalize for any linear model.
OPS is an “all in one” statistic combining on-base percentage (OBP) and slugging (SLG).
OBP is calculated as \(\frac{Hits (H) + Walks (BB) + Hit by pitch (HBP)}{At \ bats (AB) + Walks (BB) + sacrifice \ flies (SF) + Hit by pitch (HBP)}\)
Slugging is calculated as \(\frac{total \ bases (TB)}{At \ bats (AB)}\)
OPS is the sum of these two statistics.
OPS+ (or adjusted OPS) is adjusted for the park and league averages.
merged_batting |>
ggplot(aes(x = ops)) +
geom_histogram() +
labs(x = "OPS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
merged_batting |>
ggplot(aes(x = ops_2)) +
geom_histogram() +
labs(x = "OPS+")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
One idea might be to compare OPS or OPS+ to more traditional statistics, like RBIs, HRs, or batting averages to OPS or OPS+ as single predictors of salary.
One other factor to consider is the correlation between some of these variables
merged_batting |>
select(hr, rbi, pa, ba, ops) |>
GGally::ggpairs()
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Some of these potential predictors are fairly strongly correlated (like RBI and HR, or BA and OPS), so it’s important not to include to many collinear variables in a potential linear model.
Looking at some relationships to salary
Position vs. salary
merged_batting |>
mutate(positions = str_split_i(position, "-",1)) |>
ggplot(aes(x = positions,y= salary_2022)) +
geom_boxplot()
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
merged_batting |>
mutate(positions = str_split_i(position, "-",1)) |>
group_by(positions) |>
summarize(avg_salary = mean(salary_2022, na.rm = TRUE),
median_salary = median(salary_2022, na.rm = TRUE))
## # A tibble: 11 × 3
## positions avg_salary median_salary
## <chr> <dbl> <dbl>
## 1 1b 8845755. 6225000
## 2 2b 5010094. 2550000
## 3 3b 8302384. 3375000
## 4 c 3669983. 1750000
## 5 cf 8145270. 5500000
## 6 dh 11857143. 12000000
## 7 inf 1312500 1312500
## 8 lf 6971404. 6125000
## 9 of 2211196. 746100
## 10 rf 7833411. 4875000
## 11 ss 7266837 4000000
HR vs. salary
merged_batting |>
ggplot(aes(x = hr, y = salary_2022)) +
geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).
RBI vs. salary
merged_batting |>
ggplot(aes(x = rbi, y = salary_2022)) +
geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).
BA vs. salary
merged_batting |>
ggplot(aes(x = ba, y = salary_2022)) +
geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).
Plate appearances vs. salary
merged_batting |>
ggplot(aes(x = pa, y = salary_2022)) +
geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).
OPS+ vs salary
merged_batting |>
ggplot(aes(x = ops_2, y = salary_2022)) +
geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).
In order to eliminate pitchers with very few appearances or position players (catchers, first basemen, etc.) who came in to pitch in blowouts, I filtered only for pitchers with at least 10 innings pitches.
merged_pitching |>
ggplot(aes(x = ip_total)) +
geom_histogram(binwidth = 2, fill = "lightblue", col = "black")
merged_pitching |>
ggplot(aes(x = era_2)) +
geom_histogram(fill = "lightblue", col = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
merged_pitching |>
ggplot(aes(x = ip_total, y = salary_2022)) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).
merged_pitching |>
ggplot(aes(x = era_2, y = salary_2022)) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).
merged_pitching |>
ggplot(aes(x = ip_total, y = log(salary_2022))) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).
merged_pitching |>
ggplot(aes(x = era_2, y = log(salary_2022))) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).
lm_ip = lm(salary_2022 ~ ip_total, data = merged_pitching)
summary(lm_ip)
##
## Call:
## lm(formula = salary_2022 ~ ip_total, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7881584 -2346730 -1339157 542534 36868673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 453710 505732 0.897 0.37
## ip_total 41360 5280 7.833 4e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5394000 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.1283, Adjusted R-squared: 0.1262
## F-statistic: 61.35 on 1 and 417 DF, p-value: 4.002e-14
lm_ip_data = tibble(fitted.values(lm_ip), residuals(lm_ip))
names(lm_ip_data) = c("fitted", "resid")
lm_ip_data |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
view(lm_ip_data)
lm_era_2 = lm(salary_2022 ~ era_2, data = merged_pitching)
summary(lm_era_2)
##
## Call:
## lm(formula = salary_2022 ~ era_2, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4017039 -3088950 -2721275 385077 39372391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3560272 674783 5.276 2.12e-07 ***
## era_2 2371 5297 0.448 0.655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5776000 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.0004801, Adjusted R-squared: -0.001917
## F-statistic: 0.2003 on 1 and 417 DF, p-value: 0.6547
lm_era_2_data = tibble(fitted.values(lm_era_2), residuals(lm_era_2))
names(lm_era_2_data) = c("fitted", "resid")
lm_era_2_data |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
view(lm_era_2_data)
lm_ip_log = lm(log(salary_2022) ~ ip_total, data = merged_pitching)
summary(lm_ip_log)
##
## Call:
## lm(formula = log(salary_2022) ~ ip_total, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8882 -0.7565 -0.4034 0.7288 3.1811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.776177 0.097656 141.069 < 2e-16 ***
## ip_total 0.008032 0.001020 7.877 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.042 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.1295, Adjusted R-squared: 0.1274
## F-statistic: 62.05 on 1 and 417 DF, p-value: 2.933e-14
lm_ip_log_data = tibble(fitted.values(lm_ip_log), residuals(lm_ip_log))
names(lm_ip_log_data) = c("fitted", "resid")
lm_ip_log_data |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
view(lm_ip_log_data)
lm_era_2_log = lm(log(salary_2022) ~ era_2, data = merged_pitching)
summary(lm_era_2_log)
##
## Call:
## lm(formula = log(salary_2022) ~ era_2, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9979 -0.9559 -0.5614 0.8333 3.1417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.441e+01 1.304e-01 110.497 <2e-16 ***
## era_2 1.869e-04 1.024e-03 0.183 0.855
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.116 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 7.994e-05, Adjusted R-squared: -0.002318
## F-statistic: 0.03334 on 1 and 417 DF, p-value: 0.8552
lm_era_2_data_log = tibble(fitted.values(lm_era_2_log), residuals(lm_era_2_log))
names(lm_era_2_data_log) = c("fitted", "resid")
lm_era_2_data_log |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
lm_pitching = lm(salary_2022 ~ hand + pitcher_type + ip_total + era_2,
data = merged_pitching)
summary(lm_pitching)
##
## Call:
## lm(formula = salary_2022 ~ hand + pitcher_type + ip_total + era_2,
## data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8526308 -1750980 -765623 513642 34843562
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8090576 2079374 3.891 0.000116 ***
## handrhp -457404 538042 -0.850 0.395747
## pitcher_typer -7479814 1909520 -3.917 0.000105 ***
## pitcher_types -2238603 1982913 -1.129 0.259576
## ip_total 16550 5768 2.870 0.004322 **
## era_2 4082 4633 0.881 0.378782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4948000 on 413 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.2737, Adjusted R-squared: 0.2649
## F-statistic: 31.12 on 5 and 413 DF, p-value: < 2.2e-16